library(plyr)
library(tidyverse)
library(stringr)
library(RCurl)
library(ipumsr)
library(brms)
library(parallel)
library(haven)

download.function <- function(stnum, url, file.root) {
  file <- paste0(file.root,stnum,".csv")
  tf <- paste0(tempfile(),".csv")
  download.file(paste0(url, file), tf)
  return(read_csv(tf))
}

#############################
##LOADING & FORMATTING DATA##
#############################

#Set working directory to local data folder#

workingDirectory <- "/Users/ericmcghee/Dropbox/PPIC/UCF/Post-election Covid/academic paper/data"

setwd(workingDirectory)

#1990 Census CVAP counts:  ICPSR#
dict <- read_csv("06054-0002-Data_dictionary_geoids.csv") %>%
  mutate(end=start+width-1)
starts <- c(dict$start, 1)
ends <- c(dict$end,NA)
d <- read_fwf("06054-0001-Data.txt",
              col_positions=fwf_positions(start=starts, 
                                          end=ends,
                                          col_names=c(dict$varname,"ALL_TABLES")))
d1 <- filter(d, LOGRECPN %in% "0001")
output <- file("06054-0001-Data-seg1.txt")
writeLines(d1$ALL_TABLES, output)
close(output)
d2 <- filter(d, LOGRECPN %in% "0002")
output <- file("06054-0001-Data-seg2.txt")
writeLines(d1$ALL_TABLES, output)
close(output)
d3 <- filter(d, LOGRECPN %in% "0003")
output <- file("06054-0001-Data-seg3.txt")
writeLines(d1$ALL_TABLES, output)
close(output)
d4 <- filter(d, LOGRECPN %in% "0004")
output <- file("06054-0001-Data-seg4.txt")
writeLines(d1$ALL_TABLES, output)
close(output)

dict <- read_csv("06054-0002-Data_dictionary_geoids.csv") %>%
  mutate(end=start+width-1)
starts <- c(dict$start, seq(7843,7897,by=9)[-7])
ends <- c(dict$end, seq(7843,7897,by=9)[-1]-1)
columns <- c(dict$varname, "u18_usborn","u18_nat","u18_ncit","adult_usborn",
             "adult_nat","adult_ncit")
census_90 <- read_fwf("06054-0001-Data-seg1.txt",
                      col_positions=fwf_positions(start=starts,
                                                  end=ends,
                                                  col_names=columns)) %>%
  filter(SUMLEV %in% "050") %>%
  dplyr::select(STUSAB,STATEFP,CNTY,ANPSADPI,u18_usborn:adult_ncit) %>%
  mutate(u18_usborn=as.integer(u18_usborn),
         u18_nat=as.integer(u18_nat),
         u18_ncit=as.integer(u18_ncit),
         adult_usborn=as.integer(adult_usborn),
         adult_nat=as.integer(adult_nat),
         adult_ncit=as.integer(adult_ncit))
names(census_90) <- tolower(names(census_90))
census_90 <- mutate(census_90, 
                    totpop=u18_usborn+u18_nat+u18_ncit+adult_usborn+adult_nat+adult_ncit,
                    vap=adult_usborn+adult_nat+adult_ncit,
                    cvap=adult_usborn+adult_nat,
                    year=1990) %>%
  dplyr::rename(stpost=stusab, stfips=statefp, cntyfips=cnty, cntynm=anpsadpi) %>%
  dplyr::select(-c(u18_usborn:adult_ncit))

#Census CVAP 2000#
files <- unzip("stp_76.zip", list=T)$Name[3:53]
starts <- c(2,6,8,33,255,263)
ends <- c(4,7,10,45,262,270)
names <- c("sumlev","stfips","cntyfips","cit","totpop","vap")
census_00 <- lapply(files, function(x) 
  read_fwf(unzip("stp_76.zip", file=x), 
           col_positions=fwf_positions(start=starts, end=ends, col_names=names)))
census_00 <- ldply(census_00) %>%
  filter(sumlev %in% "050")
census_00 <- left_join(filter(census_00, cit %in% "Total") %>%
                         dplyr::select(-c(sumlev,cit)),
                       filter(census_00, cit %in% "Citizen") %>% 
                         dplyr::rename(cvap=vap) %>%
                         dplyr::select(-c(sumlev,cit,totpop)),
                       by=c("stfips","cntyfips")) %>%
  mutate(year=2000)

#Census estimates 2010-2020#

url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/asrh/"
file.root <- "CC-EST2020-AGESEX-"
nums <- 1:56
nums <- nums[!nums %in% c(3,7,14,43,52)]
nums <- str_pad(nums, side="left", width=2, pad="0")
census_1020 <- ldply(lapply(nums, function(x,y,z) download.function(x,y,z), url, file.root)) %>%
  dplyr::select(STATE:POPESTIMATE,AGE18PLUS_TOT) %>%
  filter(!YEAR %in% c(1,2)) %>%
  mutate(YEAR=YEAR+2007) %>%
  dplyr::rename(stfips=STATE, cntyfips=COUNTY, statenm=STNAME, cntynm=CTYNAME,
                year=YEAR, totpop=POPESTIMATE, vap=AGE18PLUS_TOT)

cdc_1020 <- read_fwf(unzip("pcen_v2020_y1020_txt.zip"),
                     col_positions=fwf_widths(c(4,2,3,2,1,1,rep(8,12)),
                                              col_names=c("series","stfips","cntyfips","age","race",
                                                          "hisp","apr10","jul10","jul11","jul12","jul13",
                                                          "jul14","jul15","jul16","jul17","jul18","jul19",
                                                          "jul20"))) 

#ACS CVAP estimates 2009-2019#

acs_09 <- read_sas(unzip("CVAP_2005-2009_ACS_sas_files.zip", "CVAP Files/county.sas7bdat")) %>%
  filter(LNTITLE %in% "Total") %>%
  dplyr::select(GEONAME, GEOID, TOT_EST, ADU_EST, CVAP_EST) %>%
  mutate(year=2009) %>%
  dplyr::rename(cntynm=GEONAME, geoid=GEOID, totpop=TOT_EST, vap=ADU_EST, cvap=CVAP_EST)

acs_10 <- read_sas(unzip("CVAP_2006-2010_ACS_sas_files.zip", "CVAP Files/county.sas7bdat")) %>%
  filter(LNTITLE %in% "Total") %>%
  dplyr::select(GEONAME, GEOID, TOT_EST, ADU_EST, CVAP_EST) %>%
  mutate(year=2010) %>%
  dplyr::rename(cntynm=GEONAME, geoid=GEOID, totpop=TOT_EST, vap=ADU_EST, cvap=CVAP_EST)

acs_11 <- read_sas(unzip("CVAP_2007-2011_ACS_sas_files.zip", "county.sas7bdat")) %>%
  filter(LNTITLE %in% "Total") %>%
  dplyr::select(GEONAME, GEOID, TOT_EST, ADU_EST, CVAP_EST) %>%
  mutate(year=2011) %>%
  dplyr::rename(cntynm=GEONAME, geoid=GEOID, totpop=TOT_EST, vap=ADU_EST, cvap=CVAP_EST)

acs_12 <- read_sas(unzip("CVAP_2008-2012_ACS_sas_files.zip", "county.sas7bdat")) %>%
  filter(LNTITLE %in% "Total") %>%
  dplyr::select(GEONAME, GEOID, TOT_EST, ADU_EST, CVAP_EST) %>%
  mutate(year=2012) %>%
  dplyr::rename(cntynm=GEONAME, geoid=GEOID, totpop=TOT_EST, vap=ADU_EST, cvap=CVAP_EST)

acs_13 <- read_sas(unzip("CVAP_2009-2013_ACS_sas_files.zip", "County.sas7bdat")) %>%
  filter(LNTITLE %in% "Total") %>%
  dplyr::select(GEONAME, GEOID, TOT_EST, ADU_EST, CVAP_EST) %>%
  mutate(year=2013) %>%
  dplyr::rename(cntynm=GEONAME, geoid=GEOID, totpop=TOT_EST, vap=ADU_EST, cvap=CVAP_EST)

acs_14 <- read_sas(unzip("CVAP_2010-2014_ACS_sas_files.zip", "county.sas7bdat")) %>%
  filter(LNTITLE %in% "Total") %>%
  dplyr::select(GEONAME, GEOID, TOT_EST, ADU_EST, CVAP_EST) %>%
  mutate(year=2014) %>%
  dplyr::rename(cntynm=GEONAME, geoid=GEOID, totpop=TOT_EST, vap=ADU_EST, cvap=CVAP_EST)

acs_15 <- read_sas(unzip("CVAP_2011-2015_ACS_sas_files.zip", "county.sas7bdat")) %>%
  filter(LNTITLE %in% "Total") %>%
  dplyr::select(GEONAME, GEOID, TOT_EST, ADU_EST, CVAP_EST) %>%
  mutate(year=2015) %>%
  dplyr::rename(cntynm=GEONAME, geoid=GEOID, totpop=TOT_EST, vap=ADU_EST, cvap=CVAP_EST)

acs_16 <- read_sas(unzip("CVAP_2012-2016_ACS_sas_files.zip", "County.sas7bdat")) %>%
  filter(LNTITLE %in% "Total") %>%
  dplyr::select(GEONAME, GEOID, TOT_EST, ADU_EST, CVAP_EST) %>%
  mutate(year=2016) %>%
  dplyr::rename(cntynm=GEONAME, geoid=GEOID, totpop=TOT_EST, vap=ADU_EST, cvap=CVAP_EST)

acs_17 <- read_sas(unzip("CVAP_2013-2017_ACS_sas_files.zip", "county.sas7bdat")) %>%
  filter(LNTITLE %in% "Total") %>%
  dplyr::select(GEONAME, GEOID, TOT_EST, ADU_EST, CVAP_EST) %>%
  mutate(year=2017) %>%
  dplyr::rename(cntynm=GEONAME, geoid=GEOID, totpop=TOT_EST, vap=ADU_EST, cvap=CVAP_EST)

acs_18 <- read_sas(unzip("CVAP_2014-2018_ACS_sas_files.zip", "county.sas7bdat")) %>%
  filter(lntitle %in% "Total") %>%
  dplyr::select(geoname, geoid, tot_est, adu_est, cvap_est) %>%
  mutate(year=2018) %>%
  dplyr::rename(cntynm=geoname, geoid=geoid, totpop=tot_est, vap=adu_est, cvap=cvap_est)

acs_19 <- read_sas(unzip("CVAP_2015-2019_ACS_sas_files.zip", "county.sas7bdat")) %>%
  filter(lntitle %in% "Total") %>%
  dplyr::select(geoname, geoid, tot_est, adu_est, cvap_est) %>%
  mutate(year=2019) %>%
  dplyr::rename(cntynm=geoname, geoid=geoid, totpop=tot_est, vap=adu_est, cvap=cvap_est)

acs_0919 <- rbind.fill(acs_09,acs_10,acs_11,acs_12,acs_13,acs_14,acs_15,acs_16,
                       acs_17,acs_18,acs_19) %>%
  mutate(stfips=str_sub(geoid, 8, 9), cntyfips=str_sub(geoid,10,12),
         cntynm2=cntynm,
         cntynm=sapply(str_split(cntynm2, ","), "[", 1),
         statenm=sapply(str_split(cntynm2, ","), "[", 2)) %>%
  dplyr::select(-c(geoid, cntynm2))

#CDC VAP Estimates https://www.cdc.gov/nchs/nvss/bridged_race/data_documentation.htm#

cdc_9099 <- lapply(1:4, function(i) 
  read_fwf(paste0("icenA1_",i,".txt"),
           col_positions=fwf_widths(c(2,3,2,1,1,rep(8,10)),
                                    col_names=c("stfips","cntyfips","age","race",
                                                "hisp","jul90","jul91","jul92","jul93",
                                                "jul94","jul95","jul96","jul97","jul98",
                                                "jul99")))) %>%
  ldply(.) %>%
  pivot_longer(jul90:jul99, names_to="year", values_to="totpop") %>%
  mutate(adult=as.integer(age>=18)*totpop) %>%
  group_by(year, stfips, cntyfips) %>%
  dplyr::summarize(totpop=sum(totpop), vap=sum(adult)) %>%
  mutate(year=as.integer(str_replace(year,"jul","19")))

cdc_0004 <- read_fwf(unzip("icen_2000_09_y0004.zip"),
                     col_positions=fwf_widths(c(8,2,3,2,1,1,rep(8,5)),
                                              col_names=c("series","stfips","cntyfips","age","race",
                                                          "hisp","jul00","jul01","jul02","jul03",
                                                          "jul04"))) %>%
  pivot_longer(jul00:jul04, names_to="year", values_to="totpop") %>%
  mutate(adult=as.integer(age>=18)*totpop) %>%
  group_by(year, stfips, cntyfips) %>%
  dplyr::summarize(totpop=sum(totpop), vap=sum(adult)) %>%
  mutate(year=as.integer(str_replace(year,"jul","20")))

cdc_0509 <- read_fwf(unzip("icen_2000_09_y0509.zip"),
                     col_positions=fwf_widths(c(8,2,3,2,1,1,rep(8,5)),
                                              col_names=c("series","stfips","cntyfips","age","race",
                                                          "hisp","jul05","jul06","jul07","jul08",
                                                          "jul09"))) %>%
  pivot_longer(jul05:jul09, names_to="year", values_to="totpop") %>%
  mutate(adult=as.integer(age>=18)*totpop) %>%
  group_by(year, stfips, cntyfips) %>%
  dplyr::summarize(totpop=sum(totpop), vap=sum(adult)) %>%
  mutate(year=as.integer(str_replace(year,"jul","20")))

cdc_0009 <- rbind.fill(cdc_0004, cdc_0509)

cdc_1020 <- read_fwf(unzip("pcen_v2020_y1020_txt.zip"),
                     col_positions=fwf_widths(c(4,2,3,2,1,1,rep(8,12)),
                                              col_names=c("series","stfips","cntyfips","age","race",
                                                          "hisp","apr10","jul10","jul11","jul12","jul13",
                                                          "jul14","jul15","jul16","jul17","jul18","jul19",
                                                          "jul20"))) %>%
  pivot_longer(apr10:jul20, names_to="year", values_to="totpop") %>%
  filter(!year %in% "apr10") %>%
  mutate(adult=as.integer(age>=18)*totpop) %>%
  group_by(year, stfips, cntyfips) %>%
  dplyr::summarize(totpop=sum(totpop), vap=sum(adult)) %>%
  mutate(year=as.integer(str_replace(year,"jul","20")))

#putting the sources together#
d <- rbind.fill(cdc_9099, cdc_0009, cdc_1020) %>%
  left_join(census_90, by=c("year","stfips","cntyfips"), suffix=c("",".2")) %>%
  dplyr::select(-c(totpop.2,vap.2)) %>%
  left_join(census_00, by=c("year","stfips","cntyfips"), suffix=c("",".2")) %>%
  mutate(cvap=ifelse(year==2000, cvap.2, cvap)) %>%
  dplyr::select(-c(totpop.2,vap.2,cvap.2)) %>%
  left_join(acs_0919, by=c("year","stfips","cntyfips"), suffix=c("",".2")) %>%
  mutate(cvap=ifelse(year>=2009, cvap.2, cvap),
         cntynm=ifelse(year>=2009, cntynm.2, cntynm)) %>%
  dplyr::select(-c(cntynm.2,totpop.2,vap.2,cvap.2)) %>%
  mutate(geoid=paste0(stfips,cntyfips),
         vap_log=log(vap),
         cvap_log=log(cvap))

write.csv(d, "vap_cvap_1990_2020.csv")

#########
##MODEL##
#########
start <- proc.time()
m <- brm(bf(cvap_log ~ vap_log +
              (1 | geoid)), data=d,
         cores=detectCores(), chains=4,
         warmup=2000, iter=6000, refresh=50)
proc.time() - start

saveRDS(m, "cvap_imputation_model.rds")

#predicting from the model#
cvap_log_hat <- posterior_predict(m, newdata=d[1:10000,], allow_new_levels=T)
cvap_log_hat_mn1 <- apply(cvap_log_hat, 2, mean)
cvap_log_hat <- posterior_predict(m, newdata=d[10001:20000,], allow_new_levels=T)
cvap_log_hat_mn2 <- apply(cvap_log_hat, 2, mean)
cvap_log_hat <- posterior_predict(m, newdata=d[20001:30000,], allow_new_levels=T)
cvap_log_hat_mn3 <- apply(cvap_log_hat, 2, mean)
cvap_log_hat <- posterior_predict(m, newdata=d[30001:40000,], allow_new_levels=T)
cvap_log_hat_mn4 <- apply(cvap_log_hat, 2, mean)
cvap_log_hat <- posterior_predict(m, newdata=d[40001:50000,], allow_new_levels=T)
cvap_log_hat_mn5 <- apply(cvap_log_hat, 2, mean)
cvap_log_hat <- posterior_predict(m, newdata=d[50001:60000,], allow_new_levels=T)
cvap_log_hat_mn6 <- apply(cvap_log_hat, 2, mean)
cvap_log_hat <- posterior_predict(m, newdata=d[60001:70000,], allow_new_levels=T)
cvap_log_hat_mn7 <- apply(cvap_log_hat, 2, mean)
cvap_log_hat <- posterior_predict(m, newdata=d[70001:80000,], allow_new_levels=T)
cvap_log_hat_mn8 <- apply(cvap_log_hat, 2, mean)
cvap_log_hat <- posterior_predict(m, newdata=d[80001:90000,], allow_new_levels=T)
cvap_log_hat_mn9 <- apply(cvap_log_hat, 2, mean)
cvap_log_hat <- posterior_predict(m, newdata=d[90001:nrow(d),], allow_new_levels=T)
cvap_log_hat_mn10 <- apply(cvap_log_hat, 2, mean)
d$cvap_log_hat <- c(cvap_log_hat_mn1, cvap_log_hat_mn2, cvap_log_hat_mn3, cvap_log_hat_mn4,
                    cvap_log_hat_mn5, cvap_log_hat_mn6, cvap_log_hat_mn7, cvap_log_hat_mn8,
                    cvap_log_hat_mn9, cvap_log_hat_mn10)
rm(cvap_log_hat_mn1, cvap_log_hat_mn2, cvap_log_hat_mn3, cvap_log_hat_mn4,
     cvap_log_hat_mn5, cvap_log_hat_mn6, cvap_log_hat_mn7, cvap_log_hat_mn8,
     cvap_log_hat_mn9, cvap_log_hat_mn10)

write.csv(d, "vap_cvap_1990_2020_imputed.csv")




